################################
## Generalization Error
## estimation for fraud paper
################################

## Set RNG seed
set.seed(831213)

## Get processed data
load("./FullData.RData")

## Set MC parallel backend
registerDoMC(3)

## Obtain best parameters from parameter sweep
bartParams <- read.csv("./BSParamSweep.csv"
                       ,nrows=1
)[,1:6]
bartParams$burnin  <- 50e3
bartParams$niter <- 5e3
bartParams$keepevery <- 10
bartParams$sampind  <- 1
bartParams  <- as.matrix(bartParams)


## The following calculates
## generalization error using 
## test set. 

# errors.one  <- llply(1:3
#                      ,.fun=function(x){err.generator(1
#                                                      ,bartParams
#                                                      ,all.data[[x]]
#                                                      ,parallel=FALSE
#                                                      ,lasso=FALSE
#                      )}
#                      ,.parallel=TRUE)

## Uncomment to save estimation results for later use.
#save(errors.one,file="./GenErrorTestSet.RData")
load("./GenErrorTestSet.RData")


## Check convergence of sigma parameter (footnote 11)
plot(errors.one[[1]][["bart.res"]][["1"]][["bartres"]][["sigma"]]
     ,xlab="Interation"
     ,ylab=expression(sigma^2)
     ,pch=19)
length(errors.one[[1]][["bart.res"]][["1"]][["bartres"]][["sigma"]])

## Table 2
out.mean <- rbind(errors.one[[1]]$error
             ,errors.one[[2]]$error
             ,errors.one[[3]]$error)
out.median  <- rbind(errors.one[[1]]$error.hack
                   ,errors.one[[2]]$error.hack
                   ,errors.one[[3]]$error.hack)
out <- cbind(out.mean[,c(1,2,5)],out.median[,c(1,2)])
rownames(out) <- c("Full", "Foren", "Subst")
out  <- as.data.frame(out)
out[,"sqrErr"]  <- sqrt(out[,"sqrErr"])
rankings <- colwise(rank)(out)
out$ConsRank  <- rowMeans(rankings)
print(round(out[,c(3,1,2,4,5,6)],3))


## Comparison to LM and LASSO (footnote 12)
lm.manual <- function(x){
lm.full <- lm(fraud.score~.-uid-train-fraud.democracy,data=subset(as.data.frame(all.data[[x]]),train==TRUE))
lm.full.preds  <- predict(lm.full,subset(as.data.frame(all.data[[x]]),train==FALSE))
absErr  <- abs(all.data[[x]][all.data[[x]][,"train"]==FALSE,"fraud.score"]-lm.full.preds)
ape  <- absErr/abs(all.data[[x]][all.data[[x]][,"train"]==FALSE,"fraud.score"])
rmse  <- sqrt(mean(absErr^2))
}
round(laply(1:3,lm.manual),3)

lasso.manual  <- function(x){
  train.data  <- as.matrix(subset(as.data.frame(all.data[[x]]),train==TRUE,select=-c(uid,train,fraud.democracy,fraud.score)))
  test.data  <- as.matrix(subset(as.data.frame(all.data[[x]]),train==FALSE,select=-c(uid,train,fraud.democracy,fraud.score)))
  train.y  <- all.data[[x]][all.data[[x]][,"train"]==TRUE,"fraud.score"]
  test.y  <- all.data[[x]][all.data[[x]][,"train"]==FALSE,"fraud.score"]
  cv.lasso  <- cv.glmnet(train.data,train.y)
  test.y.hat  <- predict(cv.lasso,test.data, s=cv.lasso$lambda.min)
  rmse <- sqrt(mean((test.y-test.y.hat)^2))
}
round(laply(1:3,lasso.manual),3)



